# Replication materials for Clause Analysis (Van Atteveldt et al., Political Analysis)
#
# Code to get the parsed text for the substantive analyses
# Note that you need proper authorisation to get the data from the AmCAT server
# For copyright reasons, the raw text of newspaper articles cannot be published 

library(amcatr)
# Connect to the AmCAT server that hosts the parsed articles.
conn <- amcat.connect("http://preview.amcat.nl")

# Project and article-set identifiers, named once so that the token download
# and the metadata lookup below cannot drift apart.
project_id <- 48
set_us <- 24009
set_china <- 24010

# Download the "corenlp" tokens of both article sets. `TRUE` is spelled out
# because `T` is an ordinary variable that can be rebound.
t_us <- amcat.gettokens(
  conn,
  project = project_id, articleset = set_us, module = "corenlp",
  only_cached = TRUE, page_size = 10
)
t_china <- amcat.gettokens(
  conn,
  project = project_id, articleset = set_china, module = "corenlp",
  only_cached = TRUE, page_size = 10
)

# Stack both token tables; the US rows come first.
tokens <- rbind(t_us, t_china)

# Label each token by the article set that contains its article (`aid`).
# Tokens whose article is in neither set are not assigned a country here;
# an article present in both sets ends up as "cn" (second assignment wins).
meta_us <- amcat.getarticlemeta(conn, set_us)
meta_china <- amcat.getarticlemeta(conn, set_china)
tokens$country[tokens$aid %in% meta_us$id] <- "us"
tokens$country[tokens$aid %in% meta_china$id] <- "cn"

# Get the search queries from codebook 568 on the AmCAT server.
# get_queries() is not defined in this script; presumably it is provided by
# functions.r, which is sourced here.
source("functions.r")
queries <- get_queries(conn, 568)

# Flag every token whose lemma matches the "attack" query (case-insensitive).
# `TRUE` is spelled out because `T` is a variable that can be rebound.
tokens$attack <- grepl(queries["attack"], tokens$lemma, ignore.case = TRUE)

# Tag tokens whose lemma matches one of the actor queries. "Hamas" is
# assigned last, so a lemma matching both queries is tagged "Hamas".
is_israel <- grepl(queries["israel"], tokens$lemma, ignore.case = TRUE)
is_hamas <- grepl(queries["hamas"], tokens$lemma, ignore.case = TRUE)
tokens$actor[is_israel] <- "Israel"
tokens$actor[is_hamas] <- "Hamas"

# Tokens without an actor get the empty string instead of NA, so that
# comparisons against "" never evaluate to NA.
tokens$actor[is.na(tokens$actor)] <- ""

# Compute clauses and quotes.
# unique_ids(), get_quotes() and get_clauses() are not defined in this
# script; presumably they come from rsyntax or functions.r -- confirm.
library(rsyntax)
tokens <- unique_ids(tokens)

# Reduce the token table to the listed columns and store it on disk.
token_cols <- c(
  "aid", "sentence", "id", "offset", "word", "lemma", "entity",
  "pos1", "relation", "parent", "country", "attack", "actor"
)
tokens <- tokens[token_cols]
saveRDS(tokens, "tokens.rds")

# Quotes are computed first because the clause extraction takes them as input.
quotes <- get_quotes(tokens)
clauses <- get_clauses(tokens, quotes = quotes)

# Attach the token-level annotations to the quote and clause rows.
# merge() joins on whatever columns the two tables share (presumably the
# token `id` -- verify against the output of get_quotes()/get_clauses()).
extra <- tokens[c("id", "country", "attack", "actor")]
quotes <- merge(quotes, extra)
clauses <- merge(clauses, extra)

saveRDS(quotes, "quotes.rds")

# Create an annotated bag of words from clauses and tokens.

# Distinct (quote, actor) pairs taken from the "source" part of each quote,
# ignoring rows without an actor.
is_actor_source <- quotes$actor != "" & quotes$quote_role == "source"
src_actor <- unique(quotes[is_actor_source, c("quote_id", "actor")])

# After unique(), a quote_id that occurs more than once has more than one
# distinct actor in its source; such quotes are dropped.
nn <- table(src_actor$quote_id)
ambiguous_ids <- names(nn[nn > 1])
src_actor <- subset(src_actor, !(quote_id %in% ambiguous_ids))

# Pair every token in the "quote" part with the actor of that quote's source
# (the merge joins on the shared `quote_id` column).
quote_src <- quotes[quotes$quote_role == "quote", c("quote_id", "id")]
quote_src <- merge(quote_src, src_actor)
quote_src <- data.frame(id = quote_src$id, source = quote_src$actor)

# Keep all clause rows: those without a matching row in quote_src get a
# missing `source`. `TRUE` is spelled out because `T` can be rebound.
clauses <- merge(clauses, quote_src, all.x = TRUE)

# Add article, sentence, lemma and part-of-speech columns from the tokens.
clauses <- merge(clauses, tokens[c("aid", "sentence", "id", "lemma", "pos1")])

saveRDS(clauses, file = "clauses.rds")

# Extract the named entities from the token table and store them on disk.
# get.entities() is not defined in this script (presumably from functions.r
# or rsyntax -- confirm).
entities <- get.entities(tokens)
saveRDS(entities, file = "entities.rds")
